library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(stringr)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.5
library(gsubfn)
## Warning: package 'gsubfn' was built under R version 4.0.5
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.0.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.5

Read in the data set

# NOTE(review): absolute, machine-specific project directory; the relative
# "Data/..." paths below are resolved against it.
setwd("C:/Users/wduff/OneDrive/School/Harvard/Fall2021/BST260/BST_260_project")

# Load the cleaned injury table and the cleaned player demographic table.
injury <- read.csv("Data/all_injuries_clean.csv")
players <- read.csv("Data/all_player_demographic_clean.csv")

# Merge data: keep every injury row and attach the demographic columns for
# rows that match on name, team, year and full_team.
injuries <- injury %>%
  left_join(players, by = c("name", "team", "year", "full_team"))
head(injuries)
dim(injuries)
## [1] 17387    34
# Display the columns at positions 11 through 18 (the ones reshaped next).
injuries[11:18]

Gathering into long format

# Reshape to long format: the eight columns at positions 11-18 are stacked so
# that `bodypart` holds the former column name and `counts` holds its value.
# pivot_longer() replaces the superseded gather(). as.data.frame() keeps the
# plain data.frame class that gather() returned for a data.frame input.
# Rows now come out grouped by original record rather than by body part; the
# grouped sums computed from this table do not depend on row order.
injury_gather <- as.data.frame(
  pivot_longer(
    injuries,
    cols = 11:18,
    names_to = "bodypart",
    values_to = "counts"
  )
)
injury_gather

EDA

Overall distribution of body part injuries

# Bar chart of the total injury count per body part over all rows of
# `injury_gather`, with bars ordered from the largest total to the smallest.
injury_gather %>%
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  geom_col() +
  # Print each total in bold just above its bar.
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    fontface = "bold"
  ) +
  xlab("Body Part") +
  ylab("Count") +
  # Title spelling corrected ("Distriubtion" -> "Distribution").
  ggtitle("Distribution of Injuries by Body Part") +
  theme_economist() +
  theme(
    # vjust nudges the axis titles away from the tick labels.
    axis.title.x = element_text(size = 16, vjust = -3),
    axis.title.y = element_text(size = 16, vjust = 3),
    title = element_text(size = 20),
    # Centre the plot title.
    plot.title = element_text(hjust = 0.5)
  )

For the next part of the EDA, let’s look at the injury distributions across the various positions in football.

# List the distinct position codes present in the merged data.
injuries$position_id %>%
  as.factor() %>%
  levels()
## [1] ""    "DEF" "K"   "OL"  "P"   "QB"  "RB"  "TE"  "WR"
# Show every merged row whose position code is kicker ("K").
filter(injuries, position_id == "K")

We see that we have the following positions: Kicker (K), Offensive Line (OL), Punter (P), Quarterback (QB), Running Back (RB), Tight End (TE), Wide Receiver (WR) and Defense (DEF).

Since Defense has its own category with no specific position (like Linebacker, Defensive Line or Safety), let’s first compare the injury distributions between Offensive Players and Defensive Players.

# Position codes treated as offense; "DEF" is the single defensive code.
offensive_position <- c("K", "OL", "P", "QB", "RB", "TE", "WR")

# Long-format rows for offensive players only
offense <- filter(injury_gather, position_id %in% offensive_position)
# Long-format rows for defensive players only
defense <- filter(injury_gather, position_id == "DEF")

# Confirm that each subset contains only the expected position codes.
offense$position_id %>%
  as.factor() %>%
  levels()
## [1] "K"  "OL" "P"  "QB" "RB" "TE" "WR"
defense$position_id %>%
  as.factor() %>%
  levels()
## [1] "DEF"
# Row and column counts of each subset.
dim(offense)
## [1] 61768    28
dim(defense)
## [1] 58568    28

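The long-format data has 61,768 offensive rows and 58,568 defensive rows. Each player record contributes 8 rows (one per body part), so this corresponds to 7,721 offensive and 7,321 defensive player records. The two groups are roughly balanced, so comparing them is reasonable.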

Overall distribution of offensive injuries

# Bar chart of the total injury count per body part for the offensive subset,
# with bars ordered from the largest total to the smallest.
offense %>%
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  geom_col() +
  # Print each total in bold just above its bar.
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    fontface = "bold"
  ) +
  xlab("Body Part") +
  ylab("Count") +
  # Title spelling corrected ("Distriubtion" -> "Distribution").
  ggtitle("Distribution of Offensive Injuries") +
  theme_economist() +
  theme(
    # vjust nudges the axis titles away from the tick labels.
    axis.title.x = element_text(size = 16, vjust = -3),
    axis.title.y = element_text(size = 16, vjust = 3),
    title = element_text(size = 20),
    # Centre the plot title.
    plot.title = element_text(hjust = 0.5)
  )

Overall distribution of defensive injuries

# Bar chart of the total injury count per body part for the defensive subset,
# with bars ordered from the largest total to the smallest.
defense %>%
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  geom_col() +
  # Print each total in bold just above its bar.
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    fontface = "bold"
  ) +
  xlab("Body Part") +
  ylab("Count") +
  # Title spelling corrected ("Distriubtion" -> "Distribution").
  ggtitle("Distribution of Defensive Injuries") +
  theme_economist() +
  theme(
    # vjust nudges the axis titles away from the tick labels.
    axis.title.x = element_text(size = 16, vjust = -3),
    axis.title.y = element_text(size = 16, vjust = 3),
    title = element_text(size = 20),
    # Centre the plot title.
    plot.title = element_text(hjust = 0.5)
  )

Distribution of injuries for kickers

# Bar chart of the total injury count per body part for kickers only
# (position_id == "K"), with bars ordered from the largest total to the
# smallest.
offense %>%
  filter(position_id == "K") %>%
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  # Draw the bars first and the labels on top, matching the other plots.
  geom_col() +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    fontface = "bold"
  ) +
  xlab("Body Part") +
  ylab("Count") +
  # The title previously read "Distribution of Offensive Injuries", which
  # mislabelled this kicker-only plot.
  ggtitle("Distribution of Kicker Injuries") +
  theme_economist() +
  theme(
    # vjust nudges the axis titles away from the tick labels.
    axis.title.x = element_text(size = 14, vjust = -3),
    axis.title.y = element_text(size = 14, vjust = 3),
    title = element_text(size = 18),
    # Centre the plot title.
    plot.title = element_text(hjust = 0.5)
  )